#coding=UTF-8
import re
from collections import Counter

import jieba
import requests
from bs4 import BeautifulSoup

# Browser-like request headers so the target server treats us as a normal client.
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
          ,'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' # content types the client can accept
          ,'Accept-Language': 'en-US,en;q=0.5' # languages the browser accepts
          ,'Connection': 'keep-alive' # whether a persistent connection is wanted
          }
# Article URLs to process; filled in by getArticleUrlList().
ARTICLE_URLS = []

def getArticleUrlList():
    """Load article URLs into the module-level ARTICLE_URLS list.

    Reads one URL per line from ../get_pic_from_weixin/articleUrlList.txt
    and appends each line (trailing newline removed) to ARTICLE_URLS.
    """
    # 'with' guarantees the file is closed even if reading raises,
    # and iterating the file object avoids materializing all lines at once.
    with open('../get_pic_from_weixin/articleUrlList.txt', 'r', encoding='utf-8') as a_f:
        for line in a_f:
            ARTICLE_URLS.append(line.replace('\n', ''))

# Fetch an article page by URL
def requestHtml(_url):
    """Fetch the page at _url and return the raw response body as bytes.

    Sends the module-level browser-like HEADERS. A timeout is set so a
    stalled server cannot hang the whole crawl indefinitely (requests
    waits forever by default when no timeout is given).
    """
    response = requests.get(_url, headers=HEADERS, timeout=30)
    return response.content

# Extract the article title from the HTML; it lives in a tag of this shape:
'''<meta property="og:title" content="请留意身边这样的......">'''
def getTheTileOfArticleFromHtml(_html):
    """Extract the article title from the <meta property="og:title"> tag.

    Accepts the raw page as bytes (as returned by requestHtml) or str.
    Returns the title string, or None when no og:title meta tag is found
    (which matches the previous stub's behavior of returning None).
    """
    if isinstance(_html, bytes):
        # requestHtml returns response.content (bytes); decode leniently.
        _html = _html.decode('utf-8', errors='ignore')
    # Attribute order can vary, so try property-before-content first,
    # then content-before-property.
    match = re.search(
        r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\']([^"\']*)["\']',
        _html, re.IGNORECASE)
    if match is None:
        match = re.search(
            r'<meta[^>]*content=["\']([^"\']*)["\'][^>]*property=["\']og:title["\']',
            _html, re.IGNORECASE)
    return match.group(1) if match else None
# Extract the plain (Chinese) text content from the HTML
def getContentFromHtml(_html):
    """Return the plain-text content of an HTML document, tags stripped."""
    soup = BeautifulSoup(_html, 'html.parser')
    return soup.get_text()


# Count word frequencies in the text
def analyzeArticle(_content):
    """Segment _content with jieba and print its 20 most common words.

    Words of length 1 and the literal '\\r\\n' token are skipped. Each line
    shows the (right-aligned) word, a bar of '*' scaled to freq/3, and the
    raw count.
    """
    words = jieba.cut(_content)
    counts = Counter(w for w in words if len(w) > 1 and w != '\r\n')
    for word, freq in counts.most_common(20):
        padding = ' ' * (5 - len(word))  # negative repeat yields '' for long words
        bar = '*' * int(freq / 3)
        print('%s%s %s %d' % (padding, word, bar, freq))


# Walk the article URLs, fetch each article's text content, and analyze it

if __name__ == '__main__':
    # For offline testing, a saved page's text can be fed directly:
    #   analyzeArticle(open('content.txt', 'r', encoding='utf8').read())
    getArticleUrlList()
    for article_url in ARTICLE_URLS:
        page_html = requestHtml(article_url)
        # Title extraction is not wired in yet, hence the placeholder name.
        print('常用词统计结果：<<%s>>' % '文章名待定')
        analyzeArticle(getContentFromHtml(page_html))